from termcolor import colored
from sklearn.tree import DecisionTreeClassifier
import missingno as msno
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pickle
import pprint
from sklearn.ensemble import RandomForestRegressor
from pandas_profiling import ProfileReport
from dateutil import relativedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from statsmodels.regression.linear_model import OLS
# Master switch read by the plot_* helpers below: when it is False they
# return immediately without drawing anything.
plot_______ = True
def new_line():
    """Print a dashed separator with an empty line before and after it."""
    print("", "-------------------------", "", sep="\n")
def RMSE(predictions, actual=None):
    """Return the root-mean-squared error of ``predictions``, rounded to an integer.

    Args:
        predictions: Predicted target values (array or Series).
        actual: True target values. When omitted, the module-level
            ``test_y`` produced by the train/test split is used, which is
            how the modeling section calls this function.

    Returns:
        ``sqrt(mean((actual - predictions) ** 2))`` rounded with ``round``.
    """
    if actual is None:
        # Backward-compatible default: fall back to the global hold-out target.
        actual = test_y
    squared_errors = (actual - predictions) ** 2
    return round(np.sqrt(squared_errors.mean()))
def plot_numerical_columns(col_name):
    """Draw the diagnostic plots for one numerical column of ``df``.

    Four figures are shown in turn: a histogram, the values in row order,
    the values sorted ascending, and a box plot. The first three carry a
    red mean marker and a green median marker.

    Args:
        col_name: Name of the column in the module-level ``df`` to plot.

    Returns:
        None. Nothing is drawn when the ``plot_______`` switch is False.
    """
    if not plot_______:
        return None

    values = df[col_name]
    mean_value = values.mean()
    median_value = values.median()

    def _mark_centre(draw_line):
        # Labels are attached to the artists themselves so the legend cannot
        # pair a label with the wrong artist.
        draw_line(mean_value, color='red', label='Mean')
        draw_line(median_value, color='green', label='Median')

    # Histogram: the column's values run along the x-axis, so the mean and
    # median must be vertical lines (a horizontal line would sit on the
    # frequency axis and be meaningless).
    values.plot(kind="hist", figsize=(13, 8), label='Actual')
    plt.title(col_name, size=18)
    _mark_centre(plt.axvline)
    plt.legend()
    plt.show()

    # Line plot of the values in row order: values are on the y-axis here.
    values.plot(figsize=(13, 8), label='Actual')
    plt.title(col_name, size=18)
    _mark_centre(plt.axhline)
    plt.legend()
    plt.show()

    # Line plot of the values sorted ascending, against their rank.
    values.sort_values().reset_index(drop=True).plot(figsize=(13, 8), label='Actual')
    plt.title(col_name + " (SORTED)", size=18)
    _mark_centre(plt.axhline)
    plt.legend()
    plt.show()

    # Box plot
    values.plot(kind="box", figsize=(13, 8))
    plt.title(col_name, size=18)
    plt.xlabel("")
    plt.show()
def plot_date_columns(col_name):
    """Show five figures describing the datetime column ``col_name`` of ``df``.

    The figures are: the dates in row order, the dates sorted ascending, and
    the percentage of rows per year, per month and per day of month.
    Returns None without drawing when ``plot_______`` is False.
    """
    if not plot_______:
        return None

    def _label_and_show(x_label, y_label, title):
        # Shared axis labelling for every figure in this function.
        plt.xlabel(x_label, size=14)
        plt.ylabel(y_label, size=14)
        plt.title(title, size=18)
        plt.show()

    dates = df[col_name]
    row_count = len(df)
    bar_style = dict(kind="bar", figsize=(15, 7), grid=True)

    dates.plot(figsize=(15, 7), grid=True)
    _label_and_show("Index", "Date", col_name + " Graph")

    dates.sort_values().reset_index(drop=True).plot(figsize=(15, 7), grid=True)
    _label_and_show("Index (sorted)", "Year", col_name + " Graph")

    year_share = dates.dt.year.value_counts(sort=False).sort_index() / row_count * 100
    year_share.plot(**bar_style)
    _label_and_show("Year", "Ratio (1-100)", col_name + " year Frequency Graph")

    month_share = dates.dt.month.value_counts().sort_index() / row_count * 100
    month_share.plot(**bar_style)
    _label_and_show("Month", "Ratio (1-100)", col_name + " month Frequency Graph")

    day_share = dates.dt.day.value_counts().sort_index() / row_count * 100
    day_share.plot(**bar_style)
    _label_and_show("Day", "Ratio (1-100)", col_name + " Day Frequency Graph")
def plot_catagorical_columns(cat_variable):
    """Bar-plot the percentage of rows in each category of ``df[cat_variable]``.

    Returns None without drawing when ``plot_______`` is False.
    """
    if not plot_______:
        return None

    category_share = df[cat_variable].value_counts() / len(df) * 100
    category_share.plot.bar(figsize=(15, 6), grid=True)
    plt.title(cat_variable, size=18, color='r')
    plt.xlabel("Catagory", size=14, color='r')
    plt.ylabel("Ratio (1-100)", size=14, color='r')
    plt.show()
def data_shape():
    """Return a short multi-line description of the current shape of ``df``."""
    n_rows, n_cols = df.shape
    return f"The Data have:\n\t{n_rows} rows\n\t{n_cols} columns\n"
#===
# Load the training and test files and stack them into one frame, `df`.
# Other inputs previously tried here: "data.csv" (targets "ACTUAL_WORTH" /
# "AREA_NAME_EN"), "df_only_selected_columns_using_PCA.csv" and
# "cleaned_data.csv" (target "SalePrice").
train = pd.read_csv("/home/amir/Downloads/train.csv")
test = pd.read_csv("/home/amir/Downloads/test.csv")
target_variable = "SalePrice"
train_y = train[target_variable]
train = train.drop(columns=target_variable)
# ignore_index=True: both files carry their own 0..n-1 index, so a plain
# concat would leave duplicated row labels in `df`.
df = pd.concat([train, test], ignore_index=True)
# The test rows have no known target; they are filled with None so the
# column has one entry per row of `df`.
df[target_variable] = train_y.to_list() + [None] * len(test)
#===
# Report the size of the data set.
new_line()
print(data_shape())
#===
# Report (and plot) how many columns there are of each dtype.
new_line()
dtype_counts = df.dtypes.value_counts()
print(f"Columns types distribution:\n\n{dtype_counts}\n")
dtype_counts.plot(kind='barh', figsize=(10, 2), grid=True, title="Variable types Count Graph")
plt.xlabel("Count")
plt.show()
del dtype_counts
#===
# Remove every row whose target value is missing.
target_is_missing = df[target_variable].isna()
missing_target_count = target_is_missing.sum()
if missing_target_count:
    new_line()
    to_print = f"There are {missing_target_count} NAs in target values, we droped those rows"
    print(colored(to_print, 'red'))
    df = df[~target_is_missing]
del target_is_missing, missing_target_count
#---------------------------------------------------
# df.select_dtypes("O").columns[:5]
# D = df.select_dtypes(exclude="O")
# D2 = df.select_dtypes("O").iloc[:,:5]
# df = pd.concat([D, D2], 1)
# profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
# profile.to_file("your_report.html")
#---------------------------------------- NA
# `a` keeps the NA count of every column that has at least one NA; the
# following sections reuse it.
a = df.isna().sum().where(lambda x: x > 0).dropna()
if a.size:
    na_column_count, column_count = len(a), df.shape[1]
    new_line()
    to_print = f"There are {na_column_count} (out of {column_count}, [{round(na_column_count/column_count*100)}%]) columns that contains 1 or more NA."
    print(colored(to_print, 'red'))
    # One "<column>_NA_indicator" text column per NA-containing column.
    indicator_labels = {True: "Missing", False: "Not missing"}
    for na_column in a.index:
        df[na_column + "_NA_indicator"] = df[na_column].isna().map(indicator_labels)
    new_line()
    to_print = f"{a.size} NA_indicator variables added to the data\n"
    print(colored(to_print, 'red'))
    print("========= NA Graphs =========\n")
    msno.matrix(df)
    plt.title("NA Graph")
    plt.show()
    new_line()
    sns.heatmap(df.isnull(), cbar=False)
    plt.title("NA Graph")
    plt.show()
#===
# Turn the NA counts into percentages of rows (sorted ascending), then drop
# the columns that are missing in every row.
a = a.sort_values() / len(df) * 100
fully_missing = a[a == 100].index
if fully_missing.size:
    new_line()
    df.drop(columns=fully_missing, inplace=True)
    to_print = f"There are {fully_missing.size} columns that are all Missing values, so we droped those.\nNow {data_shape()}\n\nDropped columns names:"
    print(colored(to_print, 'red'))
    for dropped_column in fully_missing:
        print("\t", dropped_column)
    a = a[a != 100]
del fully_missing
#===
# dtype breakdown of the columns that still contain NAs.
na_dtype_counts = df[a.index].dtypes.value_counts()
if not na_dtype_counts.empty:
    new_line()
    print(f"NA columns data type Distribution:\n\n{na_dtype_counts}")
del na_dtype_counts
#===
# Percentage of missing rows per column, or a note that nothing is missing.
new_line()
if not a.size:
    print(colored("Now There is no NaN value in our Data", 'red'))
else:
    print(f"NaN Ratio (0-100)\n\n{a}")
#===
# ----------------------------------------------- Imputing Missing values
# ------------------------------------ Numerical columns imputing
def _missing_values_report():
    """Describe how many NAs `df` holds in total and per broad dtype group."""
    total = df.isna().sum().sum()
    in_categorical = df.select_dtypes("O").isna().sum().sum()
    in_numeric = df.select_dtypes("number").isna().sum().sum()
    in_other = df.select_dtypes(exclude=["O", "number"]).isna().sum().sum()
    return f'There are {total} Missing values:\n\t{in_categorical} in catagorical variables\n\t{in_numeric} in numerical columns\n\t{in_other} in others'


if df.select_dtypes("number").isna().sum().sum():
    new_line()
    print('(Before Missing values treatment)\n' + _missing_values_report())
    from sklearn.impute import KNNImputer
    numeric_part = df.select_dtypes("number")
    other_part = df.select_dtypes(exclude="number")
    del df
    # Fill numeric NAs from the 4 nearest rows (uniform weights).
    knn_imputer = KNNImputer(n_neighbors=4, weights="uniform")
    numeric_part = pd.DataFrame(knn_imputer.fit_transform(numeric_part), columns=numeric_part.columns)
    # Both halves get a fresh 0..n-1 index so they line up row by row; the
    # non-numeric columns come first in the rebuilt frame.
    df = pd.concat([other_part.reset_index(drop=True), numeric_part.reset_index(drop=True)], axis=1)
    del other_part
    del numeric_part
    print('\n(After filling numeric missing values)\n' + _missing_values_report())
#===
# -------------------------------- Catagoriacal variables imputating
# Object columns with NAs are filled one at a time, least-missing first, by a
# decision tree trained on the rows where the column is known.
vars_to_fill = df.select_dtypes("O").isna().mean().where(lambda x: x > 0).dropna().sort_values(ascending=True)
if vars_to_fill.size:
    for col in vars_to_fill.index:
        # Predictors: every column that currently has no NA at all. Columns
        # filled by earlier iterations therefore become predictors for later
        # ones.
        # NOTE(review): this set includes the target column whenever it has
        # no NAs, so target information flows into the imputed features.
        complete_columns = df.columns[df.isna().sum() == 0]
        predictors = df[complete_columns]
        # axis is passed by keyword: the positional form `pd.concat(objs, 1)`
        # is rejected by pandas >= 2.0.
        design = pd.concat(
            [
                predictors.select_dtypes("number"),
                pd.get_dummies(predictors.select_dtypes(exclude="number"), prefix_sep="__"),
            ],
            axis=1,
        )
        is_known = df[col].notna()
        # Fixed seed so repeated runs impute the same values.
        clf = DecisionTreeClassifier(random_state=0).fit(design[is_known], df.loc[is_known, col])
        df.loc[~is_known, col] = clf.predict(design[~is_known])
    new_line()
    print(f"Missing values imputed, Now there are {df.isna().sum().sum()} Missing values")
# ----------------------------------------------- END Imputing Missing values
# --------------------------------------------------------- Unique values
# Columns holding a single distinct value are dropped.
unique_counts = df.nunique()
only_one_unique_value = unique_counts[unique_counts == 1]
constant_count = only_one_unique_value.size
if constant_count:
    new_line()
    df.drop(columns=only_one_unique_value.index, inplace=True)
    if constant_count == 1:
        last_ = ("", "it")
    else:
        last_ = ("s", "those")
    plural, pronoun = last_
    to_print = f"There are {constant_count} variable{plural} That have only one unique value, so we droped {pronoun}.\nDropped column{plural} name{plural} (in order):"
    print(colored(to_print, 'red'))
    for dropped_column in only_one_unique_value.index.sort_values():
        print(dropped_column)
    new_line()
    print(f"\nNow {data_shape()}")
del only_one_unique_value, unique_counts, constant_count
# #===
# Columns in which no value repeats are dropped. The target is excluded from
# this check: a continuous target can have all-distinct values, and dropping
# it would break every later step that reads df[target_variable].
all_values_are_unique = pd.Index(
    [c for c in df.columns if c != target_variable and df[c].is_unique]
)
if all_values_are_unique.size:
    new_line()
    df.drop(columns=all_values_are_unique, inplace=True)
    last_ = ("", "it") if all_values_are_unique.size == 1 else ("s", "those")
    to_print = f"There are {all_values_are_unique.size} column{last_[0]} that have all unique values, so no value repeatation, we droped {last_[1]} column{last_[0]}.\nDropped column{last_[0]} name{last_[0]} are:\n"
    print(colored(to_print, 'red'))
    for i in all_values_are_unique:
        print("\t", i)
    new_line()
    print(f"Now {data_shape()}")
del all_values_are_unique
#===
date_columns = []
def DTYPES():
    """Classify every column of ``df`` as 'Object', 'Number' or 'Date'.

    Side effects:
        * Every object column that ``pd.to_datetime`` can parse is converted
          to datetime in ``df`` in place and counted as a 'Date' column.
        * The module-level ``date_columns`` is replaced by the detected date
          column names (a ``pd.Index``, or an empty list when there are none).
        * A warning is printed when a column lands in more than one group or
          in none of them.

    Returns:
        DataFrame with one row per classified column and the columns
        ``Column`` (name) and ``dtype`` ('Object' / 'Number' / 'Date').
    """
    global date_columns
    sample = df.head()
    catagorical_columns = sample.select_dtypes("O").columns
    numerical_columns = sample.select_dtypes("number").columns
    detected_dates = []
    for i in catagorical_columns:
        try:
            df[i] = pd.to_datetime(df[i])
        except (ValueError, TypeError, OverflowError):
            # Not parseable as dates: the column stays categorical. Only the
            # parsing failures are swallowed, not every possible exception.
            continue
        detected_dates.append(i)
    catagorical_columns = catagorical_columns.drop(detected_dates)
    # Kept as a plain (empty) list when nothing was detected so that
    # truth-testing `date_columns` elsewhere keeps working.
    date_columns = pd.Index(detected_dates) if detected_dates else []
    all_columns = catagorical_columns.append(numerical_columns).append(
        pd.Index(detected_dates, dtype=object)
    )
    dtypes = pd.DataFrame({
        "Column": all_columns,
        "dtype": ['Object'] * len(catagorical_columns)
                 + ['Number'] * len(numerical_columns)
                 + ['Date'] * len(detected_dates),
    })
    #===
    if not all_columns.is_unique:
        new_line()
        print(colored("Some column/s repated in > 1 dtypes\n", 'red'))
        repeated_names = dtypes.loc[dtypes["Column"].duplicated(), "Column"]
        print(dtypes[dtypes["Column"].isin(repeated_names)].to_string())
    #===
    x = df.columns.difference(all_columns)
    if x.size:
        new_line()
        print(colored("Some columns not included in any existing catagory, those:\n", 'red'))
        for i in x:
            print(f"\t<{i}, with dtype of <{df[i].dtype}>")
    #===
    return dtypes
#===
# Classify the columns once before feature engineering; DTYPES() also fills
# the global `date_columns`.
dtypes = DTYPES()
# ----------------------------------------------------------------------- Feature engineering
# ======= Adding date columns
# TODO: add polynomial, sqrt, tree and log features
def add_new_date_cols(x, suffix):
    """Derive calendar features from the datetime Series ``x`` and add them to ``df``.

    Args:
        x: Datetime column taken from the module-level ``df``.
        suffix: Text placed in front of every new column name
            (e.g. ``"<suffix>_week_str"``).

    Returns:
        A copy of ``df`` without the original column ``x.name``. The derived
        columns are written into ``df`` itself before the copy is made, and
        only those with more than one distinct value are kept.
    """
    def _quoted(values):
        # Integer-valued numbers become quoted strings such as '"12"' so they
        # are treated as categories; missing values stay NaN.
        return '"' + values.apply(lambda v: np.nan if np.isnan(v) else str(int(v))) + '"'

    # Series.dt.week was removed in pandas 2.0; isocalendar() is its
    # replacement. It is cast to float so missing dates become NaN.
    if hasattr(x.dt, "isocalendar"):
        week = x.dt.isocalendar().week.astype("float64")
    else:
        week = x.dt.week  # pandas versions that predate isocalendar()

    d = {}
    d[suffix + '_week_normalized'] = week / 52
    d[suffix + '_week_str'] = _quoted(week)
    d[suffix + '_year_after_min_year'] = x.dt.year - x.dt.year.min()
    d[suffix + '_year_str'] = _quoted(x.dt.year)
    d[suffix + '_day_name'] = x.dt.day_name()
    # Whole days elapsed since the earliest date, taken from the leading
    # token of the timedelta's text form.
    d[suffix + '_day_after_min_date_str'] = '"' + (x - x.min()).apply(lambda v: str(v).split()[0]) + '"'
    d[suffix + '_day_normalized'] = x.dt.day / 31
    d[suffix + '_hour_normalized'] = x.dt.hour / 24
    d[suffix + '_hour_str'] = _quoted(x.dt.hour)
    d[suffix + '_month_name'] = x.dt.month_name()
    d[suffix + '_month_normalized'] = x.dt.month / 12
    for k, v in d.items():
        # A feature with a single distinct value carries no information.
        if v.nunique() > 1:
            df[k] = v
    return df.drop(columns=x.name)
# Replace every detected date column by its derived calendar features and
# report the net change in the number of columns.
len_df_before_adding_date_vars = df.shape[1]
for date_col in date_columns:
    df = add_new_date_cols(df[date_col], date_col)
len_df_after_adding_date_vars = df.shape[1]
if len_df_before_adding_date_vars < len_df_after_adding_date_vars:
    new_line()
    to_print = f"Added {len_df_after_adding_date_vars - len_df_before_adding_date_vars} date Features"
    print(colored(to_print, 'red'))
# ======= Numeric columns whose distinct values amount to fewer than 4% of the
# rows also get a quoted-string "<column>_str" copy, so they can be handled
# as categorical variables.
# The target is left out: a "<target>_str" column would hand the answer to
# the models as a feature.
numeric_features = df.select_dtypes("number").drop(columns=target_variable, errors="ignore")
unique_share = numeric_features.nunique() / len(df) * 100
low_cardinality_columns = unique_share[unique_share < 4].index
if low_cardinality_columns.size:
    column_count_before = df.shape[1]
    for col_num_to_str in low_cardinality_columns:
        df[col_num_to_str + "_str"] = '"' + df[col_num_to_str].astype(str) + '"'
    column_count_after = df.shape[1]
    new_line()
    to_print = f"Added {column_count_after - column_count_before} String Features (Extracted from numerical variables)"
    print(colored(to_print, 'red'))
del numeric_features, unique_share, low_cardinality_columns
# =======
def cluping_rare_cases_in_one_catagory(x, min_count=10, min_rare_total=8):
    """Lump the infrequent categories of ``df[x]`` into one "Rare cases" level.

    Args:
        x: Name of a column in the module-level ``df``.
        min_count: Categories occurring fewer times than this are merged
            into "Rare cases".
        min_rare_total: If the merged "Rare cases" level still has fewer
            rows than this, those rows are given the most common value of
            the column instead.

    Returns:
        The transformed column as a new Series, or None when the result has
        a single distinct value; in that case the column is dropped from
        ``df`` and a message is printed.
    """
    global df
    column = df[x]
    level_counts = column.value_counts()
    rare_levels = level_counts[level_counts < min_count].index.to_list()
    if rare_levels:
        column = column.replace(rare_levels, "Rare cases")
    else:
        column = column.copy()
    # The size of the lumped level is counted directly rather than read from
    # the last entry of value_counts() by position.
    rare_total = (column == "Rare cases").sum()
    if 0 < rare_total < min_rare_total:
        # Too few rows even after lumping: replace them with the most common value.
        column[column == "Rare cases"] = column.mode()[0]
    if column.nunique() == 1:
        new_line()
        to_print = f"The column <{column.name}> have only one unique value, We droped it from the data."
        print(colored(to_print, 'red'))
        df.drop(columns=column.name, inplace=True)
        return None
    return column
# Lump rare categories in every object column; a None result means the
# column was dropped inside the helper.
for var in df.select_dtypes("O").columns:
    lumped = cluping_rare_cases_in_one_catagory(var)
    if lumped is not None:
        df[var] = lumped
new_line()
# Per column: how many rows ended up in the "Rare cases" level.
rare_counts = (df == 'Rare cases').sum().sort_values().where(lambda x: x > 0).dropna()
xx = pd.DataFrame({
    "Count": rare_counts,
    "Ratio": round(rare_counts / len(df) * 100, 4),
})
print(f"<Rare case> catagory:\n{xx.to_string()}")
del rare_counts
# ----------------------------------------------------------------------- END (Feature enginearing)
# Re-classify the columns now that features were added and removed.
dtypes = DTYPES()
# ---------------------------------------------------- Correlation plot
new_line()
# Absolute pairwise correlations of the numeric columns; reused by the
# per-column report further down.
cor_df = df.select_dtypes('number').corr().abs()
# Hide the upper triangle (diagonal included): it mirrors the lower one.
mask = np.triu(np.ones(cor_df.shape, dtype=bool))
f, ax = plt.subplots(figsize=(17, 10))
cmap = sns.color_palette("viridis", as_cmap=True)
heatmap_options = dict(mask=mask, cmap=cmap, vmax=.3, square=True, linewidths=.5, cbar_kws={"shrink": .5})
plot_ = sns.heatmap(cor_df, **heatmap_options)
plot_.axes.set_title("abs (Correlation) plot", fontsize=25)
plt.show()
del heatmap_options
# ---------------------------------------------------------------------
#===
def _report_number_column(x, column_name):
    """Print correlation, skewness and summary statistics of a numeric column, then plot it."""
    # cor_df already holds absolute values; NaN correlations cannot be ranked
    # and are left out.
    local_cor = cor_df[column_name].drop(column_name).dropna().sort_values().reset_index()
    # isclose instead of == 1: a computed correlation of a perfectly
    # correlated pair is not guaranteed to be exactly 1.0.
    perfect = local_cor.loc[np.isclose(local_cor[column_name], 1), 'index']
    if len(perfect):
        new_line()
        to_print = f"This column is perfactly correlated with column <{perfect.iloc[0]}, so remove one of them"
        print(colored(to_print, 'red'))
    if len(local_cor):
        new_line()
        xm = (
            local_cor.tail(3)
            .rename(columns={'index': 'Column name', column_name: 'Correlation'})
            .set_index('Column name')
        )
        xm.plot(kind='barh', grid=True, figsize=(10, 1.5))
        plt.title("Most 3 correlated features with this columns (sorted)", size=14)
        plt.xlabel("Correlation", size=12)
        plt.show()
    new_line()
    skewness = x.skew(skipna=True)
    if abs(skewness) < 0.5:
        print(f"The data is fairly symmetrical (skewness is: {skewness})")
    elif abs(skewness) < 1:
        print(f"The data are moderately skewed (skewness is: {skewness})")
    else:
        to_print = f"The data are highly skewed (skewness is: {skewness})\nNote: When skewness exceed |1| we called it highly skewed"
        print(colored(to_print, 'red'))
    q25, q50, q75 = x.quantile([.25, .5, .75]).to_list()
    stats = pd.Series({
        'Count': x.count(),
        'NA': x.isna().sum(),
        'Mean': x.mean(),
        'Std': x.std(),
        'Min': x.min(),
        '25%': q25,
        '50%': q50,
        '75%': q75,
        'Max': x.max(),
        'Nunique': x.nunique(),
        # Outliers: more than 3 standard deviations away from the mean.
        'Outlies': (((x - x.mean()) / x.std()).abs() > 3).sum(),
        'Nagetive': (x < 0).sum(),
        'Zeros': (x == 0).sum(),
    }, name='Count').to_frame()
    stats['Ratio'] = stats.Count / x.count() * 100
    # A ratio is only meaningful for the rows that are counts.
    stats.loc['Mean':'Max', 'Ratio'] = None
    new_line()
    print(stats.round(2).to_string())
    plot_numerical_columns(column_name)


def _report_object_column(x, column_name):
    """Print frequency statistics and case/whitespace warnings of a categorical column, then plot it."""
    value_counts = x.value_counts()
    stats = pd.Series({
        'Count': x.count(),
        'Nunique': x.nunique(),
        'Len': len(x),
        'NA': x.isna().sum(),
        'Most frequent': (x == x.mode().values[0]).sum(),
        'Least frequent': (x == value_counts.tail(1).index[0]).sum(),
        'Values occured only once': (value_counts == 1).sum(),
    }, name='Counts').to_frame()
    stats['Ratio'] = (stats.Counts / x.count() * 100).round(4)
    stats.loc[['Len'], 'Ratio'] = None
    new_line()
    print(stats.to_string())
    if x.str.lower().nunique() != x.nunique():
        new_line()
        to_print = f"Case issue\n\tin orignal variable There are {x.nunique()} unique values\n\tin lower verstion there are {x.str.lower().nunique()} unique values.\n"
        print(colored(to_print, 'red'))
    if x.str.strip().nunique() != x.nunique():
        new_line()
        to_print = f"Space issue\n\tin orignal variable There are {x.nunique()} unique values\n\tin striped verstion there are {x.str.strip().nunique()} unique values."
        print(colored(to_print, 'red'))
    plot_catagorical_columns(column_name)


def _missing_calendar_values(part):
    """Return, in ascending order, the integers between min and max of ``part`` that never occur."""
    present = part.dropna().astype(int)
    if present.empty:
        return []
    return sorted(set(range(present.min(), present.max() + 1)) - set(present))


def _report_date_column(x, column_name):
    """Print span, frequency statistics and calendar gaps of a date column, then plot it."""
    new_line()
    rd = relativedelta.relativedelta(pd.to_datetime(x.max()), pd.to_datetime(x.min()))
    to_print = f"Diffrenece between first and last date:\n\tYears : {rd.years}\n\tMonths: {rd.months}\n\tDays : {rd.days}"
    print(colored(to_print, 'red'))
    value_counts = x.value_counts()
    stats = pd.Series({
        'Count': x.count(),
        'Nunique': x.nunique(),
        'Most frequent values': (x == x.mode().values[0]).sum(),
        'Least frequent values': (x == value_counts.tail(1).index[0]).sum(),
        'Values occured only once count': (value_counts == 1).sum(),
    }, name='Counts').to_frame()
    stats['Ratio'] = (stats.Counts / x.count() * 100).round(4)
    new_line()
    print(f"\n{stats.to_string()}")
    gap_reports = (
        (x.dt.year, "These Years (in order) are missing:\n"),
        (x.dt.month, "These Months (in order) are missing:\n"),
        (x.dt.day, "These Days (in order) are missing:\n"),
    )
    for part, heading in gap_reports:
        # The gaps are sorted so the "(in order)" promise of the heading holds.
        gaps = _missing_calendar_values(part)
        if gaps:
            new_line()
            print(colored(heading, 'red'))
            for i in gaps:
                print("\t", i, end=", ")
    new_line()
    plot_date_columns(column_name)


# One report per classified column. Columns that turn out to be useless are
# dropped from `df` on the way.
for column_name, type_ in zip(dtypes["Column"].to_list(), dtypes["dtype"].to_list()):
    x = df[column_name]
    # f-string: the column name is interpolated into the section header.
    to_print = f"\n\n\n========================================= {column_name} =========================================\n\n"
    print(colored(to_print, 'red'))
    # Another column is reported as a duplicate when the two map one-to-one
    # onto each other (same number of distinct values and every distinct
    # pair uses each value exactly once).
    distinct_count = x.nunique()
    for col_ in df.columns:
        if col_ == column_name or df[col_].nunique() != distinct_count:
            continue
        unique_combination = df[[col_, column_name]].drop_duplicates()
        if unique_combination[col_].is_unique and unique_combination[column_name].is_unique:
            new_line()
            to_print = f"This Columns is duplicate of <{col_}> column"
            print(colored(to_print, 'red'))
    print(f"Column Type : ", end="")
    print(colored(type_, 'red'))
    if x.isna().all():
        new_line()
        df.drop(columns=column_name, inplace=True)
        print(colored("We dropped This column, because it is all Empty", 'red'))
        continue
    # DTYPES() labels categorical columns "Object", so that is the label
    # tested here.
    if type_ in ["Object", "Date"]:
        if x.is_unique:
            new_line()
            df.drop(columns=column_name, inplace=True)
            to_print = f"We dropped This column, because it's a {type_} columns, and it's all values are unique"
            print(colored(to_print, 'red'))
            continue
    if x.nunique() == 1:
        new_line()
        df.drop(columns=column_name, inplace=True)
        print(colored("We dropped This column, because There is only one unique value", 'red'))
        continue
    if type_ == "Number":
        _report_number_column(x, column_name)
    elif type_ == "Object":
        _report_object_column(x, column_name)
    elif type_ == "Date":
        _report_date_column(x, column_name)
# ================================================================================================================ Modeling
print("\n\n")
print("----------------------------------------------------------------------------------------------")
print("****************************************** Modeling ******************************************")
# The kind of problem is chosen from the dtype of the target. Any numeric
# dtype (not only the platform's default float/int width) counts as
# regression; booleans are excluded.
target_is_numeric = (
    pd.api.types.is_numeric_dtype(df[target_variable])
    and not pd.api.types.is_bool_dtype(df[target_variable])
)
# Regression problem
if target_is_numeric:
    print("\n-------------------- This is Regression problem --------------------\n")
    print("''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")
    # Numeric columns first, then one-hot encoded non-numeric columns. The
    # dummies are created as floats so the design matrix is purely numeric.
    df = pd.concat(
        [
            df.select_dtypes("number"),
            pd.get_dummies(df.select_dtypes(exclude="number"), prefix_sep="__", dtype=float),
        ],
        axis=1,
    )
    # ====
    # Fixed seed so the split, and every metric below, is reproducible.
    train_X, test_X, train_y, test_y = train_test_split(
        df.drop(columns=target_variable), df[target_variable], random_state=0
    )
    # ====
    # --------------------------------------------------------- Linear regression
    to_print = "\n ------------------------------------- Linear Regression -------------------------------------\n"
    print(colored(to_print, 'red'))
    model_reg = OLS(train_y, train_X).fit()
    summary = model_reg.summary()
    # The coefficient table of the summary is turned into a DataFrame: its
    # first row is the header, its first column the variable name.
    summary_df = pd.DataFrame(summary.tables[1])
    summary_df.columns = summary_df.iloc[0]
    summary_df.drop(0, inplace=True)
    summary_df.columns = summary_df.columns.astype(str)
    summary_df.columns = ["Variable"] + summary_df.columns[1:].to_list()
    for i in summary_df.columns[1:]:
        summary_df[i] = summary_df[i].astype(str).astype(float)
    summary_df.Variable = summary_df.Variable.astype(str)
    # Significance stars by p-value.
    summary_df['Indicator'] = summary_df['P>|t|'].apply(lambda x: "***" if x < 0.001 else "**" if x < 0.01 else "*" if x < 0.05 else "." if x < 0.1 else "")
    summary_df = summary_df.sort_values("Variable").reset_index(drop=True)
    # Write the file the message below announces; to_csv() without a path
    # only returns the CSV text and saves nothing.
    summary_df.to_csv("summary_OLS_1.csv", index=False)
    new_line()
    print(colored("NOTE: This summary saved as <summary_OLS_1.csv>", 'red'))
    new_line()
    print(summary_df.to_string())
    # ============================= Model statistic
    predictions = model_reg.predict(test_X)
    new_line()
    print(colored(" --- Model statistic --- \n", 'red'))
    print(f"R-squared : {round(model_reg.rsquared, 3)}")
    print(f"Adj. R-squared : {round(model_reg.rsquared_adj, 3)}")
    print(f"F-statistic : {round(model_reg.fvalue)}")
    print(f"Prob (F-statistic): {model_reg.f_pvalue}")
    print(f"No. Observations : {round(model_reg.nobs)}")
    print(f"AIC : {round(model_reg.aic)}")
    print(f"Df Residuals : {round(model_reg.df_resid)}")
    print(f"BIC : {round(model_reg.bic)}")
    print(f"RMSE (test) : {RMSE(predictions)}")
    # ======
    # Residuals that still correlate with a feature hint at structure the
    # linear model did not capture.
    f = train_X.copy("deep")
    f['Errors__'] = model_reg.resid
    f = f.corr()['Errors__'].drop("Errors__").abs().sort_values().dropna().tail(1)
    new_line()
    print(f"Maximum correlation between Reseduals and any data columns is {f.values[0]}, with columns <{f.index[0]}>")
    print(f"Mean of train reseduals: {model_reg.resid.mean()}")
    del f
    # ============================= END (Model statistic)
    # --------------------------------------------------------- END Linear regression
    # --------------------------------------------------------- Random Forest
    print("\n ------------------------------------- Random Forest -------------------------------------\n")
    rf = RandomForestRegressor(n_estimators=200, oob_score=True)
    model_rf = rf.fit(train_X, train_y)
    predictions_rf = rf.predict(test_X)
    new_line()
    print(colored("RF model peramters:\n", 'red'))
    pprint.pprint(model_rf.get_params())
    new_line()
    # Only the 30 most important features are plotted.
    featuresImportance = (
        pd.Series(model_rf.feature_importances_, index=train_X.columns)
        .sort_values(ascending=False)
        .head(30)
    )
    featuresImportance.plot(figsize=(20, 10), kind='bar', grid=True)
    plt.title("RandomForest Feature importances Graph", size=18, color='red')
    plt.xlabel("Features", size=14, color='red')
    plt.ylabel("Importance", size=14, color='red')
    plt.show()
    del featuresImportance
    new_line()
    print(colored("--- Model statistic ---", 'red'))
    # score() is the coefficient of determination R^2 of the prediction.
    # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
    print(f"R^2 (test) : {rf.score(test_X, test_y)}")
    print(f"R^2 (train): {rf.score(train_X, train_y)}")
    print(f"RMSE (test): {RMSE(predictions_rf)}")
    print(f"oob score : {model_rf.oob_score_}")
    f = test_X.copy("deep")
    errors_rf = predictions_rf - test_y
    f['Errors__'] = errors_rf
    f = f.corr()['Errors__'].drop("Errors__").abs().sort_values().dropna().tail(1)
    new_line()
    print(f"Maximum correlation between Reseduals and any data columns is {f.values[0]}, with columns <{f.index[0]}>")
    # --------------------------------------------------------- END Random Forest
elif df[target_variable].dtype == "O":
    # Classififcation problem
    if df[target_variable].nunique() == 2:
        print("\n-------------------- This is Binary Classification problem --------------------\n")
        print("''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")
        # Non-object columns, one-hot encoded object features, then the
        # (object) target as the last column. axis is passed by keyword:
        # the positional form is rejected by pandas >= 2.0.
        df = pd.concat(
            [
                df.select_dtypes(exclude="O"),
                pd.get_dummies(df.drop(columns=target_variable).select_dtypes("O")),
                df[[target_variable]],
            ],
            axis=1,
        )
        train_X, test_X, train_y, test_y = train_test_split(
            df.drop(columns=target_variable), df[target_variable], random_state=0
        )
        # Positional index so the hold-out target lines up with the
        # predictions built below.
        test_y = test_y.reset_index(drop=True)
        clf = LogisticRegression().fit(train_X, train_y)
        negative_class, positive_class = clf.classes_
        # Probability of the second class; a row is predicted as that class
        # when its probability reaches the threshold.
        predictions = pd.Series(clf.predict_proba(test_X)[:, 1])
        lst = []
        for thresh in np.linspace(predictions.min(), predictions.max(), 50)[1:]:
            pred = pd.Series(np.where(predictions >= thresh, positive_class, negative_class))
            TP = ((pred == positive_class) & (test_y == positive_class)).sum()
            FN = ((pred == negative_class) & (test_y == positive_class)).sum()
            FP = ((pred == positive_class) & (test_y == negative_class)).sum()
            # NaN instead of a division by zero when a metric is undefined.
            p = TP / (TP + FP) if (TP + FP) else np.nan
            r = TP / (TP + FN) if (TP + FN) else np.nan
            f = 2 * p * r / (p + r) if (p + r) else np.nan
            lst.append((thresh, (pred == test_y).mean(), p, r, f))
        d = pd.DataFrame(lst, columns=["Thresold", "Accurecy(0-1)", "Precision", "Recall", "F1"])
        d = d.set_index("Thresold")
        d.plot(grid=True, figsize=(18, 7))
        plt.title("Model performance at diffrent Thresolds", size=18, color='red')
        plt.xlabel("Thresold", size=14, color='red')
        plt.ylabel("")
        plt.show()
    else:
        to_print = "\n-------------------- This is Multiclass Classification problem --------------------\n"
        print(colored(to_print, 'red'))
        print("'''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''")
        # Every object column (the target included) is replaced by integer
        # codes. The arrays are assigned directly, so nothing depends on how
        # the index of `df` is labelled. The target's encoder is kept to
        # translate the codes back for display.
        target_encoder = LabelEncoder()
        for object_column in df.select_dtypes("O").columns:
            encoder = target_encoder if object_column == target_variable else LabelEncoder()
            df[object_column] = encoder.fit_transform(df[object_column].astype(str))
        train_X, test_X, train_y, test_y = train_test_split(
            df.drop(columns=target_variable), df[target_variable], random_state=0
        )
        clf = RandomForestClassifier(n_estimators=1000).fit(train_X, train_y)
        predictions = clf.predict(test_X)
        feature_imp = (
            pd.Series(clf.feature_importances_, index=train_X.columns)
            .sort_values(ascending=False)
            .head(30)
        )
        feature_imp.plot(kind='barh', figsize=(17, 10), grid=True)
        plt.title("Feature importances Graph", size=18, color='red')
        plt.xlabel("Importance", size=14, color='red')
        plt.ylabel("Feature", size=14, color='red')
        plt.show()
        # ====
        print(f"accuracy_score: {metrics.accuracy_score(test_y, predictions)}")
        # The default average="binary" raises for more than two classes;
        # the macro average weighs every class equally.
        print(f"f1_score: {metrics.f1_score(test_y, predictions, average='macro')}")
        # ROC and precision-recall curves are defined for two classes only,
        # so they are not drawn for this multiclass model.
        confusion = metrics.confusion_matrix(test_y, predictions, labels=clf.classes_)
        metrics.ConfusionMatrixDisplay(
            confusion,
            display_labels=target_encoder.inverse_transform(clf.classes_),
        ).plot()
        plt.title("Confusion matrix")
        plt.show()
# ================================================================================================================ END Modeling
------------------------- The Data have: 2919 rows 81 columns ------------------------- Columns types distribution: object 43 int64 26 float64 12 dtype: int64
------------------------- There are 1459 NAs in target values, we droped those rows ------------------------- There are 19 (out of 81, [23%]) columns that contains 1 or more NA. ------------------------- 19 NA_indicator variables added to the data ========= NA Graphs =========
-------------------------
------------------------- NA columns data type Distribution: object 16 float64 3 dtype: int64 ------------------------- NaN Ratio (0-100) Electrical 0.068493 MasVnrType 0.547945 MasVnrArea 0.547945 BsmtQual 2.534247 BsmtCond 2.534247 BsmtFinType1 2.534247 BsmtExposure 2.602740 BsmtFinType2 2.602740 GarageCond 5.547945 GarageQual 5.547945 GarageFinish 5.547945 GarageType 5.547945 GarageYrBlt 5.547945 LotFrontage 17.739726 FireplaceQu 47.260274 Fence 80.753425 Alley 93.767123 MiscFeature 96.301370 PoolQC 99.520548 dtype: float64 ------------------------- (Before Missing values treatment) There are 6965 Missing values: 6617 in catagorical variables 348 in numerical columns 0.0 in others (After filling numeric missing values) There are 6617 Missing values: 6617 in catagorical variables 0 in numerical columns 0.0 in others ------------------------- Missing values imputed, Now there are 0 Missing values ------------------------- There are 1 column that have all unique values, so no value repeatation, we droped it column. Dropped column name are: Id ------------------------- Now The Data have: 1460 rows 99 columns ------------------------- Added 18 String Features (Extracted from numerical variables) ------------------------- The column <Street> have only one unique value, We droped it from the data. ------------------------- The column <Utilities> have only one unique value, We droped it from the data. ------------------------- The column <Electrical_NA_indicator> have only one unique value, We droped it from the data. ------------------------- The column <PoolQC_NA_indicator> have only one unique value, We droped it from the data. ------------------------- The column <PoolArea_str> have only one unique value, We droped it from the data. 
------------------------- <Rare case> catagory: Count Ratio HouseStyle 8.0 0.5479 MasVnrType_NA_indicator 8.0 0.5479 MasVnrArea_NA_indicator 8.0 0.5479 FullBath_str 9.0 0.6164 Foundation 9.0 0.6164 RoofStyle 9.0 0.6164 MiscFeature 9.0 0.6164 Neighborhood 11.0 0.7534 Heating 14.0 0.9589 BedroomAbvGr_str 14.0 0.9589 Condition1 15.0 1.0274 Condition2 15.0 1.0274 RoofMatl 15.0 1.0274 Exterior2nd 17.0 1.1644 3SsnPorch_str 24.0 1.6438 LowQualFinSF_str 26.0 1.7808 SaleType 28.0 1.9178 MiscVal_str 41.0 2.8082 -------------------------
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1151 78.8356 Least frequent 10 0.6849 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 829 56.7808 Least frequent 631 43.2192 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 925 63.3562 Least frequent 10 0.6849 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1311 89.7945 Least frequent 36 2.4658 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1056 72.3288 Least frequent 47 3.2192 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1382 94.6575 Least frequent 13 0.8904 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 24 1.6438 Len 1460 NaN NA 0 0.0000 Most frequent 225 15.4110 Least frequent 11 0.7534 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 7 0.4795 Len 1460 NaN NA 0 0.0000 Most frequent 1260 86.3014 Least frequent 11 0.7534 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1445 98.9726 Least frequent 15 1.0274 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1220 83.5616 Least frequent 31 2.1233 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 8 0.5479 Len 1460 NaN NA 0 0.0000 Most frequent 726 49.7260 Least frequent 8 0.5479 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1141 78.1507 Least frequent 9 0.6164 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1434 98.2192 Least frequent 11 0.7534 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 10 0.6849 Len 1460 NaN NA 0 0.0000 Most frequent 522 35.7534 Least frequent 20 1.3699 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 12 0.8219 Len 1460 NaN NA 0 0.0000 Most frequent 504 34.5205 Least frequent 10 0.6849 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 864 59.1781 Least frequent 15 1.0274 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 906 62.0548 Least frequent 14 0.9589 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1286 88.0822 Least frequent 28 1.9178 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 647 44.3151 Least frequent 9 0.6164 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 663 45.4110 Least frequent 52 3.5616 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1335 91.4384 Least frequent 60 4.1096 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 955 65.4110 Least frequent 116 7.9452 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 6 0.4110 Len 1460 NaN NA 0 0.0000 Most frequent 467 31.9863 Least frequent 74 5.0685 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 6 0.4110 Len 1460 NaN NA 0 0.0000 Most frequent 1293 88.5616 Least frequent 14 0.9589 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1428 97.8082 Least frequent 14 0.9589 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 742 50.8219 Least frequent 49 3.3562 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1365 93.4932 Least frequent 95 6.5068 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1339 91.7123 Least frequent 27 1.8493 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 735 50.3425 Least frequent 39 2.6712 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1366 93.5616 Least frequent 14 0.9589 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 660 45.2055 Least frequent 39 2.6712 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 909 62.2603 Least frequent 11 0.7534 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 676 46.3014 Least frequent 354 24.2466 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1331 91.1644 Least frequent 15 1.0274 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1376 94.2466 Least frequent 10 0.6849 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1340 91.7808 Least frequent 30 2.0548 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.000 Nunique 2 0.137 Len 1460 NaN NA 0 0.000 Most frequent 870 59.589 Least frequent 590 40.411 Values occured only once 0 0.000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 594 40.6849 Least frequent 38 2.6027 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 890 60.9589 Least frequent 9 0.6164 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 1267 86.7808 Least frequent 28 1.9178 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 1202 82.3288 Least frequent 12 0.8219 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1201 82.2603 Least frequent 259 17.7397 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1369 93.7671 Least frequent 91 6.2329 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <MasVnrArea_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1452 99.4521 Least frequent 8 0.5479 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <MasVnrType_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1452 99.4521 Least frequent 8 0.5479 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <BsmtCond_NA_indicator> column ------------------------- This Columns is duplicate of <BsmtFinType1_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1423 97.4658 Least frequent 37 2.5342 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <BsmtQual_NA_indicator> column ------------------------- This Columns is duplicate of <BsmtFinType1_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1423 97.4658 Least frequent 37 2.5342 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1422 97.3973 Least frequent 38 2.6027 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <BsmtQual_NA_indicator> column ------------------------- This Columns is duplicate of <BsmtCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1423 97.4658 Least frequent 37 2.5342 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1422 97.3973 Least frequent 38 2.6027 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 770 52.7397 Least frequent 690 47.2603 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageCond_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <GarageType_NA_indicator> column ------------------------- This Columns is duplicate of <GarageYrBlt_NA_indicator> column ------------------------- This Columns is duplicate of <GarageFinish_NA_indicator> column ------------------------- This Columns is duplicate of <GarageQual_NA_indicator> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1379 94.4521 Least frequent 81 5.5479 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1179 80.7534 Least frequent 281 19.2466 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1406 96.3014 Least frequent 54 3.6986 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 14 0.9589 Len 1460 NaN NA 0 0.0000 Most frequent 540 36.9863 Least frequent 10 0.6849 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 8 0.5479 Len 1460 NaN NA 0 0.0000 Most frequent 402 27.5342 Least frequent 18 1.2329 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 7 0.4795 Len 1460 NaN NA 0 0.0000 Most frequent 827 56.6438 Least frequent 22 1.5068 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1434 98.2192 Least frequent 26 1.7808 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 857 58.6986 Least frequent 15 1.0274 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1380 94.5205 Least frequent 80 5.4795 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <FullBath> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 768 52.6027 Least frequent 9 0.6164 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <HalfBath> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 913 62.5342 Least frequent 12 0.8219 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 6 0.4110 Len 1460 NaN NA 0 0.0000 Most frequent 804 55.0685 Least frequent 14 0.9589 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1395 95.5479 Least frequent 65 4.4521 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 10 0.6849 Len 1460 NaN NA 0 0.0000 Most frequent 404 27.6712 Least frequent 11 0.7534 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 695 47.6027 Least frequent 115 7.8767 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 4 0.2740 Len 1460 NaN NA 0 0.0000 Most frequent 829 56.7808 Least frequent 81 5.5479 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 2 0.1370 Len 1460 NaN NA 0 0.0000 Most frequent 1436 98.3562 Least frequent 24 1.6438 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 3 0.2055 Len 1460 NaN NA 0 0.0000 Most frequent 1408 96.4384 Least frequent 11 0.7534 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <MoSold> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 12 0.8219 Len 1460 NaN NA 0 0.0000 Most frequent 253 17.3288 Least frequent 52 3.5616 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <YrSold> column Column Type : Object ------------------------- Counts Ratio Count 1460 100.0000 Nunique 5 0.3425 Len 1460 NaN NA 0 0.0000 Most frequent 338 23.1507 Least frequent 175 11.9863 Values occured only once 0 0.0000
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.4076567471495591)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.0 100.00
NA 0.0 0.00
Mean 56.9 NaN
Std 42.3 NaN
Min 20.0 NaN
25% 20.0 NaN
50% 50.0 NaN
75% 70.0 NaN
Max 190.0 NaN
Nunique 15.0 1.03
Outlies 30.0 2.05
Nagetive 0.0 0.00
Zeros 0.0 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 2.0120008521763144)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 70.75 NaN
Std 23.47 NaN
Min 21.00 NaN
25% 60.00 NaN
50% 70.00 NaN
75% 80.00 NaN
Max 313.00 NaN
Nunique 224.00 15.34
Outlies 14.00 0.96
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 12.207687851233496)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 10516.83 NaN
Std 9981.26 NaN
Min 1300.00 NaN
25% 7553.50 NaN
50% 9478.50 NaN
75% 11601.50 NaN
Max 215245.00 NaN
Nunique 1073.00 73.49
Outlies 13.00 0.89
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.2169439277628693)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 6.10 NaN
Std 1.38 NaN
Min 1.00 NaN
25% 5.00 NaN
50% 6.00 NaN
75% 7.00 NaN
Max 10.00 NaN
Nunique 10.00 0.68
Outlies 2.00 0.14
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.6930674724842182)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 5.58 NaN
Std 1.11 NaN
Min 1.00 NaN
25% 5.00 NaN
50% 5.00 NaN
75% 6.00 NaN
Max 9.00 NaN
Nunique 9.00 0.62
Outlies 28.00 1.92
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: -0.613461172488183)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1971.27 NaN
Std 30.20 NaN
Min 1872.00 NaN
25% 1954.00 NaN
50% 1973.00 NaN
75% 2000.00 NaN
Max 2010.00 NaN
Nunique 112.00 7.67
Outlies 6.00 0.41
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: -0.5035620027004709)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1984.87 NaN
Std 20.65 NaN
Min 1950.00 NaN
25% 1967.00 NaN
50% 1994.00 NaN
75% 2004.00 NaN
Max 2010.00 NaN
Nunique 61.00 4.18
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 2.6682455485578593)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 103.84 NaN
Std 180.74 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 166.00 NaN
Max 1600.00 NaN
Nunique 335.00 22.95
Outlies 32.00 2.19
Nagetive 0.00 0.00
Zeros 861.00 58.97
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.685503071910789)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 443.64 NaN
Std 456.10 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 383.50 NaN
75% 712.25 NaN
Max 5644.00 NaN
Nunique 637.00 43.63
Outlies 6.00 0.41
Nagetive 0.00 0.00
Zeros 467.00 31.99
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.255261108933303)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 46.55 NaN
Std 161.32 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 1474.00 NaN
Nunique 144.00 9.86
Outlies 50.00 3.42
Nagetive 0.00 0.00
Zeros 1293.00 88.56
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.9202684528039037)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 567.24 NaN
Std 441.87 NaN
Min 0.00 NaN
25% 223.00 NaN
50% 477.50 NaN
75% 808.00 NaN
Max 2336.00 NaN
Nunique 780.00 53.42
Outlies 11.00 0.75
Nagetive 0.00 0.00
Zeros 118.00 8.08
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.5242545490627664)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1057.43 NaN
Std 438.71 NaN
Min 0.00 NaN
25% 795.75 NaN
50% 991.50 NaN
75% 1298.25 NaN
Max 6110.00 NaN
Nunique 721.00 49.38
Outlies 10.00 0.68
Nagetive 0.00 0.00
Zeros 37.00 2.53
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.3767566220336365)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1162.63 NaN
Std 386.59 NaN
Min 334.00 NaN
25% 882.00 NaN
50% 1087.00 NaN
75% 1391.25 NaN
Max 4692.00 NaN
Nunique 753.00 51.58
Outlies 12.00 0.82
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.8130298163023265)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 346.99 NaN
Std 436.53 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 728.00 NaN
Max 2065.00 NaN
Nunique 417.00 28.56
Outlies 4.00 0.27
Nagetive 0.00 0.00
Zeros 829.00 56.78
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 9.011341288465387)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 5.84 NaN
Std 48.62 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 572.00 NaN
Nunique 24.00 1.64
Outlies 20.00 1.37
Nagetive 0.00 0.00
Zeros 1434.00 98.22
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.3665603560164552)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1515.46 NaN
Std 525.48 NaN
Min 334.00 NaN
25% 1129.50 NaN
50% 1464.00 NaN
75% 1776.75 NaN
Max 5642.00 NaN
Nunique 861.00 58.97
Outlies 16.00 1.10
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.596066609663168)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.43 NaN
Std 0.52 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 1.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 16.00 1.10
Nagetive 0.00 0.00
Zeros 856.00 58.63
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.103402697955168)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.06 NaN
Std 0.24 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 2.00 NaN
Nunique 3.00 0.21
Outlies 82.00 5.62
Nagetive 0.00 0.00
Zeros 1378.00 94.38
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <FullBath_str> column Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.036561558402727165)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1.57 NaN
Std 0.55 NaN
Min 0.00 NaN
25% 1.00 NaN
50% 2.00 NaN
75% 2.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 9.00 0.62
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <HalfBath_str> column Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.675897448233722)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.38 NaN
Std 0.50 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 1.00 NaN
Max 2.00 NaN
Nunique 3.00 0.21
Outlies 12.00 0.82
Nagetive 0.00 0.00
Zeros 913.00 62.53
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.21179009627507137)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 2.87 NaN
Std 0.82 NaN
Min 0.00 NaN
25% 2.00 NaN
50% 3.00 NaN
75% 3.00 NaN
Max 8.00 NaN
Nunique 8.00 0.55
Outlies 14.00 0.96
Nagetive 0.00 0.00
Zeros 6.00 0.41
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.488396777072859)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1.05 NaN
Std 0.22 NaN
Min 0.00 NaN
25% 1.00 NaN
50% 1.00 NaN
75% 1.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 68.00 4.66
Nagetive 0.00 0.00
Zeros 1.00 0.07
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.6763408364355531)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 6.52 NaN
Std 1.63 NaN
Min 2.00 NaN
25% 5.00 NaN
50% 6.00 NaN
75% 7.00 NaN
Max 14.00 NaN
Nunique 12.00 0.82
Outlies 12.00 0.82
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: 0.6495651830548841)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 0.61 NaN
Std 0.64 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 1.00 NaN
75% 1.00 NaN
Max 3.00 NaN
Nunique 4.00 0.27
Outlies 5.00 0.34
Nagetive 0.00 0.00
Zeros 690.00 47.26
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are moderately skewed (skewness is: -0.541264504372725)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1977.23 NaN
Std 24.78 NaN
Min 1900.00 NaN
25% 1960.00 NaN
50% 1978.00 NaN
75% 2001.00 NaN
Max 2010.00 NaN
Nunique 148.00 10.14
Outlies 1.00 0.07
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: -0.3425489297486655)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 1.77 NaN
Std 0.75 NaN
Min 0.00 NaN
25% 1.00 NaN
50% 2.00 NaN
75% 2.00 NaN
Max 4.00 NaN
Nunique 5.00 0.34
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 81.00 5.55
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.17998090674623907)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 472.98 NaN
Std 213.80 NaN
Min 0.00 NaN
25% 334.50 NaN
50% 480.00 NaN
75% 576.00 NaN
Max 1418.00 NaN
Nunique 441.00 30.21
Outlies 7.00 0.48
Nagetive 0.00 0.00
Zeros 81.00 5.55
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.5413757571931312)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 94.24 NaN
Std 125.34 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 168.00 NaN
Max 857.00 NaN
Nunique 274.00 18.77
Outlies 22.00 1.51
Nagetive 0.00 0.00
Zeros 761.00 52.12
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 2.3643417403694404)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 46.66 NaN
Std 66.26 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 25.00 NaN
75% 68.00 NaN
Max 547.00 NaN
Nunique 202.00 13.84
Outlies 27.00 1.85
Nagetive 0.00 0.00
Zeros 656.00 44.93
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 3.08987190371177)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 21.95 NaN
Std 61.12 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 552.00 NaN
Nunique 120.00 8.22
Outlies 51.00 3.49
Nagetive 0.00 0.00
Zeros 1252.00 85.75
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 10.304342032693112)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 3.41 NaN
Std 29.32 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 508.00 NaN
Nunique 20.00 1.37
Outlies 23.00 1.58
Nagetive 0.00 0.00
Zeros 1436.00 98.36
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 4.122213743143115)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 15.06 NaN
Std 55.76 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 480.00 NaN
Nunique 76.00 5.21
Outlies 55.00 3.77
Nagetive 0.00 0.00
Zeros 1344.00 92.05
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 14.828373640750588)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 2.76 NaN
Std 40.18 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 738.00 NaN
Nunique 8.00 0.55
Outlies 7.00 0.48
Nagetive 0.00 0.00
Zeros 1453.00 99.52
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 24.476794188821916)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 43.49 NaN
Std 496.12 NaN
Min 0.00 NaN
25% 0.00 NaN
50% 0.00 NaN
75% 0.00 NaN
Max 15500.00 NaN
Nunique 21.00 1.44
Outlies 8.00 0.55
Nagetive 0.00 0.00
Zeros 1408.00 96.44
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <MoSold_str> column Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.21205298505146022)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 6.32 NaN
Std 2.70 NaN
Min 1.00 NaN
25% 5.00 NaN
50% 6.00 NaN
75% 8.00 NaN
Max 12.00 NaN
Nunique 12.00 0.82
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= ------------------------- This Columns is duplicate of <YrSold_str> column Column Type : Number -------------------------
-------------------------
The data is fairly symmetrical (skewness is: 0.09626851386568028)
-------------------------
Count Ratio
Count 1460.00 100.00
NA 0.00 0.00
Mean 2007.82 NaN
Std 1.33 NaN
Min 2006.00 NaN
25% 2007.00 NaN
50% 2008.00 NaN
75% 2009.00 NaN
Max 2010.00 NaN
Nunique 5.00 0.34
Outlies 0.00 0.00
Nagetive 0.00 0.00
Zeros 0.00 0.00
f ========================================= {column_name} ========================================= Column Type : Number -------------------------
-------------------------
The data are highly skewed (skewness is: 1.8828757597682129)
Note: When skewness exceed |1| we called it highly skewed
-------------------------
Count Ratio
Count 1460.0 100.00
NA 0.0 0.00
Mean 180921.2 NaN
Std 79442.5 NaN
Min 34900.0 NaN
25% 129975.0 NaN
50% 163000.0 NaN
75% 214000.0 NaN
Max 755000.0 NaN
Nunique 663.0 45.41
Outlies 22.0 1.51
Nagetive 0.0 0.00
Zeros 0.0 0.00
---------------------------------------------------------------------------------------------- ****************************************** Modeling ****************************************** -------------------- This is Regression problem -------------------- '''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''' ------------------------------------- Linear Regression ------------------------------------- ------------------------- NOTE: This summary saved as <summary_OLS_1.csv> ------------------------- Variable coef std err t P>|t| [0.025 0.975] Indicator 0 1stFlrSF -4.3413 12.481 -0.348 0.728 -28.839 20.157 1 2ndFlrSF 5.8851 12.053 0.488 0.625 -17.773 29.543 2 3SsnPorch 37.2694 61.822 0.603 0.547 -84.076 158.615 3 3SsnPorch_str__"0.0" -2079.8137 7551.898 -0.275 0.783 -16900.000 12700.000 4 3SsnPorch_str__Rare cases 2074.9696 7551.879 0.275 0.784 -12700.000 16900.000 5 Alley_NA_indicator__Missing 3495.5872 2376.485 1.471 0.142 -1169.064 8160.239 6 Alley_NA_indicator__Not missing -3500.4314 2376.529 -1.473 0.141 -8165.170 1164.307 7 Alley__Grvl -1259.8442 1572.476 -0.801 0.423 -4346.357 1826.669 8 Alley__Pave 1255.0001 1572.245 0.798 0.425 -1831.059 4341.059 9 BedroomAbvGr -1310.4123 4435.130 -0.295 0.768 -10000.000 7395.024 10 BedroomAbvGr_str__"1.0" 5672.4422 10400.000 0.548 0.584 -14700.000 26000.000 11 BedroomAbvGr_str__"2.0" 1798.4046 6077.876 0.296 0.767 -10100.000 13700.000 12 BedroomAbvGr_str__"3.0" -2616.3816 3408.673 -0.768 0.443 -9307.049 4074.286 13 BedroomAbvGr_str__"4.0" 2691.6555 4775.554 0.564 0.573 -6681.976 12100.000 14 BedroomAbvGr_str__"5.0" -11650.0000 10500.000 -1.109 0.268 -32300.000 8964.596 15 BedroomAbvGr_str__Rare cases 4094.2179 10500.000 0.391 0.696 -16400.000 24600.000 16 BldgType__1Fam -6570.7722 13500.000 -0.487 0.626 -33000.000 19900.000 17 BldgType__2fmCon 9358.5009 27300.000 0.342 0.732 -44300.000 63000.000 18 BldgType__Duplex -9159.0609 6250.601 -1.465 0.143 -21400.000 3109.847 19 BldgType__Twnhs 1802.8162 14900.000 
0.121 0.903 -27400.000 31000.000 20 BldgType__TwnhsE 4563.6718 13700.000 0.332 0.740 -22400.000 31500.000 21 BsmtCond_NA_indicator__Missing -3701.3730 4229.913 -0.875 0.382 -12000.000 4601.255 22 BsmtCond_NA_indicator__Not missing 3696.5288 4229.779 0.874 0.382 -4605.836 12000.000 23 BsmtCond__Fa -5347.3105 4084.021 -1.309 0.191 -13400.000 2668.955 24 BsmtCond__Gd 3030.9907 3450.088 0.879 0.380 -3740.968 9802.950 25 BsmtCond__TA 2311.4756 2478.842 0.932 0.351 -2554.086 7177.037 26 BsmtExposure_NA_indicator__Missing -3701.3730 4229.913 -0.875 0.382 -12000.000 4601.255 27 BsmtExposure_NA_indicator__Not missing 3696.5288 4229.779 0.874 0.382 -4605.836 12000.000 28 BsmtExposure__Av -6207.6649 2190.969 -2.833 0.005 -10500.000 -1907.151 ** 29 BsmtExposure__Gd 17660.0000 2984.114 5.917 0.000 11800.000 23500.000 *** 30 BsmtExposure__Mn -3695.9078 2705.070 -1.366 0.172 -9005.518 1613.702 31 BsmtExposure__No -7758.9521 1861.821 -4.167 0.000 -11400.000 -4104.501 *** 32 BsmtFinSF1 2.2682 3.832 0.592 0.554 -5.254 9.791 33 BsmtFinSF2 7.6862 8.343 0.921 0.357 -8.689 24.061 34 BsmtFinType1_NA_indicator__Missing -3701.3730 4229.913 -0.875 0.382 -12000.000 4601.255 35 BsmtFinType1_NA_indicator__Not missing 3696.5288 4229.779 0.874 0.382 -4605.836 12000.000 36 BsmtFinType1__ALQ 298.1800 2456.050 0.121 0.903 -4522.644 5119.004 37 BsmtFinType1__BLQ 1895.4817 2756.897 0.688 0.492 -3515.856 7306.820 38 BsmtFinType1__GLQ 4342.6084 2648.201 1.640 0.101 -855.377 9540.594 39 BsmtFinType1__LwQ -5220.3497 4048.687 -1.289 0.198 -13200.000 2726.562 40 BsmtFinType1__Rec -1696.8173 3023.496 -0.561 0.575 -7631.446 4237.811 41 BsmtFinType1__Unf 376.0527 2777.093 0.135 0.892 -5074.928 5827.033 42 BsmtFinType2_NA_indicator__Missing 6910.1842 14600.000 0.474 0.636 -21700.000 35500.000 43 BsmtFinType2_NA_indicator__Not missing -6915.0283 14600.000 -0.474 0.636 -35500.000 21700.000 44 BsmtFinType2__ALQ 3665.4667 6987.331 0.525 0.600 -10000.000 17400.000 45 BsmtFinType2__BLQ 2888.6273 5567.056 0.519 0.604 
-8038.595 13800.000 46 BsmtFinType2__GLQ -3756.6246 9105.309 -0.413 0.680 -21600.000 14100.000 47 BsmtFinType2__LwQ -2871.1734 5283.429 -0.543 0.587 -13200.000 7499.334 48 BsmtFinType2__Rec -4915.4802 4756.905 -1.033 0.302 -14300.000 4421.546 49 BsmtFinType2__Unf 4984.3400 5164.718 0.965 0.335 -5153.158 15100.000 50 BsmtFullBath 14820.0000 11700.000 1.270 0.205 -8094.779 37700.000 51 BsmtFullBath_str__"0.0" 16570.0000 12100.000 1.364 0.173 -7281.817 40400.000 52 BsmtFullBath_str__"1.0" 6780.8522 4060.562 1.670 0.095 -1189.367 14800.000 . 53 BsmtFullBath_str__"2.0" -23350.0000 13900.000 -1.684 0.093 -50600.000 3867.778 . 54 BsmtHalfBath -53.0447 19200.000 -0.003 0.998 -37700.000 37600.000 55 BsmtHalfBath_str__"0.0" -3765.1222 9849.549 -0.382 0.702 -23100.000 15600.000 56 BsmtHalfBath_str__"1.0" 3760.2781 9849.521 0.382 0.703 -15600.000 23100.000 57 BsmtQual_NA_indicator__Missing -3701.3730 4229.913 -0.875 0.382 -12000.000 4601.255 58 BsmtQual_NA_indicator__Not missing 3696.5288 4229.779 0.874 0.382 -4605.836 12000.000 59 BsmtQual__Ex 12650.0000 4262.020 2.968 0.003 4285.840 21000.000 ** 60 BsmtQual__Fa -3987.1200 5118.332 -0.779 0.436 -14000.000 6059.329 61 BsmtQual__Gd -4065.7863 2610.048 -1.558 0.120 -9188.884 1057.312 62 BsmtQual__TA -4603.4274 2633.496 -1.748 0.081 -9772.551 565.696 . 63 BsmtUnfSF -2.2817 4.677 -0.488 0.626 -11.462 6.899 64 CentralAir__N -912.8214 2737.458 -0.333 0.739 -6286.004 4460.362 65 CentralAir__Y 907.9772 2737.455 0.332 0.740 -4465.199 6281.153 66 Condition1__Artery -961.7133 5707.105 -0.169 0.866 -12200.000 10200.000 67 Condition1__Feedr -3844.9486 4237.316 -0.907 0.364 -12200.000 4472.212 68 Condition1__Norm 6771.8850 2816.167 2.405 0.016 1244.210 12300.000 * 69 Condition1__PosN -13940.0000 7854.404 -1.775 0.076 -29400.000 1472.366 . 
70 Condition1__RRAe -18500.0000 8460.254 -2.187 0.029 -35100.000 -1893.172 * 71 Condition1__RRAn 16410.0000 6147.979 2.669 0.008 4343.348 28500.000 ** 72 Condition1__Rare cases 14060.0000 7799.400 1.803 0.072 -1246.027 29400.000 . 73 Condition2__Norm 22250.0000 5447.705 4.085 0.000 11600.000 32900.000 *** 74 Condition2__Rare cases -22260.0000 5447.824 -4.085 0.000 -32900.000 -11600.000 *** 75 Electrical__FuseA -1759.5466 3478.963 -0.506 0.613 -8588.182 5069.089 76 Electrical__FuseF 6884.2751 5210.653 1.321 0.187 -3343.385 17100.000 77 Electrical__SBrkr -5129.5727 3263.721 -1.572 0.116 -11500.000 1276.579 78 EnclosedPorch 31.1905 17.278 1.805 0.071 -2.724 65.104 . 79 ExterCond__Fa 1717.2756 5284.605 0.325 0.745 -8655.542 12100.000 80 ExterCond__Gd -2627.5526 3380.791 -0.777 0.437 -9263.492 4008.387 81 ExterCond__TA 905.4329 2830.684 0.320 0.749 -4650.736 6461.602 82 ExterQual__Ex 13300.0000 6988.705 1.903 0.057 -415.524 27000.000 . 83 ExterQual__Fa -25.9986 11100.000 -0.002 0.998 -21800.000 21800.000 84 ExterQual__Gd -5271.3592 4462.546 -1.181 0.238 -14000.000 3487.891 85 ExterQual__TA -8009.6483 4306.193 -1.860 0.063 -16500.000 442.705 . 86 Exterior1st__AsbShng 19440.0000 15600.000 1.242 0.214 -11300.000 50200.000 87 Exterior1st__BrkFace 11630.0000 7446.038 1.562 0.119 -2987.712 26200.000 88 Exterior1st__CemntBd -20140.0000 19500.000 -1.035 0.301 -58300.000 18000.000 89 Exterior1st__HdBoard 4624.9870 6189.838 0.747 0.455 -7524.654 16800.000 90 Exterior1st__MetalSd -3432.5884 9580.418 -0.358 0.720 -22200.000 15400.000 91 Exterior1st__Plywood 9235.7424 6299.837 1.466 0.143 -3129.808 21600.000 92 Exterior1st__Stucco -20670.0000 11900.000 -1.732 0.084 -44100.000 2755.309 . 
93 Exterior1st__VinylSd -2391.7225 7648.713 -0.313 0.755 -17400.000 12600.000 94 Exterior1st__Wd Sdng -2117.8927 5784.309 -0.366 0.714 -13500.000 9235.760 95 Exterior1st__WdShing 3813.3503 8581.103 0.444 0.657 -13000.000 20700.000 96 Exterior2nd__AsbShng -19320.0000 15500.000 -1.243 0.214 -49800.000 11200.000 97 Exterior2nd__BrkFace 445.0308 9143.817 0.049 0.961 -17500.000 18400.000 98 Exterior2nd__CmentBd 17590.0000 20300.000 0.866 0.387 -22300.000 57400.000 99 Exterior2nd__HdBoard -2736.2205 6022.968 -0.454 0.650 -14600.000 9085.882 100 Exterior2nd__ImStucc -14130.0000 9621.497 -1.469 0.142 -33000.000 4755.580 101 Exterior2nd__MetalSd 1690.7938 9899.988 0.171 0.864 -17700.000 21100.000 102 Exterior2nd__Plywood -4016.3164 5575.661 -0.720 0.472 -15000.000 6927.795 103 Exterior2nd__Rare cases -6568.0191 9745.793 -0.674 0.501 -25700.000 12600.000 104 Exterior2nd__Stucco 18070.0000 12100.000 1.491 0.136 -5720.569 41900.000 105 Exterior2nd__VinylSd 7464.7050 7210.296 1.035 0.301 -6687.928 21600.000 106 Exterior2nd__Wd Sdng 4530.7551 5654.631 0.801 0.423 -6568.362 15600.000 107 Exterior2nd__Wd Shng -3025.2082 7322.541 -0.413 0.680 -17400.000 11300.000 108 Fence_NA_indicator__Missing -1043.1026 1231.793 -0.847 0.397 -3460.912 1374.707 109 Fence_NA_indicator__Not missing 1038.2585 1231.867 0.843 0.400 -1379.696 3456.212 110 Fence__GdPrv -281.0904 2269.067 -0.124 0.901 -4734.898 4172.717 111 Fence__GdWo -226.8164 2262.290 -0.100 0.920 -4667.321 4213.689 112 Fence__MnPrv 1298.0768 2043.669 0.635 0.525 -2713.313 5309.466 113 Fence__MnWw -795.0142 4630.464 -0.172 0.864 -9883.859 8293.831 114 FireplaceQu_NA_indicator__Missing -558.8191 476.744 -1.172 0.241 -1494.591 376.952 115 FireplaceQu_NA_indicator__Not missing 553.9750 476.620 1.162 0.245 -381.552 1489.502 116 FireplaceQu__Ex 5391.1276 5134.033 1.050 0.294 -4686.141 15500.000 117 FireplaceQu__Fa -300.1210 2956.550 -0.102 0.919 -6103.345 5503.103 118 FireplaceQu__Gd -1306.5021 2067.934 -0.632 0.528 -5365.519 2752.515 119 
FireplaceQu__Po -206.9138 3601.468 -0.057 0.954 -7276.007 6862.179 120 FireplaceQu__TA -3582.4348 2144.271 -1.671 0.095 -7791.288 626.419 . 121 Fireplaces -17430.0000 5393.602 -3.231 0.001 -28000.000 -6840.035 ** 122 Fireplaces_str__"0.0" -22070.0000 5403.495 -4.085 0.000 -32700.000 -11500.000 *** 123 Fireplaces_str__"1.0" -2977.0931 1411.126 -2.110 0.035 -5746.903 -207.283 * 124 Fireplaces_str__"2.0" 25040.0000 5722.919 4.376 0.000 13800.000 36300.000 *** 125 Foundation__BrkTil -21.9775 4891.709 -0.004 0.996 -9623.603 9579.648 126 Foundation__CBlock 4676.1991 3911.212 1.196 0.232 -3000.871 12400.000 127 Foundation__PConc 6893.7130 4085.783 1.687 0.092 -1126.012 14900.000 . 128 Foundation__Rare cases -22360.0000 10400.000 -2.158 0.031 -42700.000 -2022.010 * 129 Foundation__Slab 10810.0000 10000.000 1.076 0.282 -8902.879 30500.000 130 FullBath 7655.7680 5728.657 1.336 0.182 -3588.650 18900.000 131 FullBath_str__"1.0" -6310.2331 7805.781 -0.808 0.419 -21600.000 9011.240 132 FullBath_str__"2.0" -7587.7706 3964.744 -1.914 0.056 -15400.000 194.374 . 
133 FullBath_str__"3.0" 9713.8474 3789.400 2.563 0.011 2275.874 17200.000 * 134 FullBath_str__Rare cases 4179.3121 7797.121 0.536 0.592 -11100.000 19500.000 135 Functional__Maj1 -12040.0000 7937.842 -1.516 0.130 -27600.000 3545.128 136 Functional__Min1 4978.5878 6240.879 0.798 0.425 -7271.238 17200.000 137 Functional__Min2 4900.9551 5480.331 0.894 0.371 -5856.039 15700.000 138 Functional__Mod -9515.1811 8057.106 -1.181 0.238 -25300.000 6299.602 139 Functional__Typ 11670.0000 3592.836 3.247 0.001 4614.204 18700.000 ** 140 GarageArea 11.3343 10.806 1.049 0.295 -9.876 32.544 141 GarageCars 7118.5604 8957.390 0.795 0.427 -10500.000 24700.000 142 GarageCars_str__"0.0" 680.4035 1646.418 0.413 0.680 -2551.246 3912.053 143 GarageCars_str__"1.0" -1873.6670 8690.287 -0.216 0.829 -18900.000 15200.000 144 GarageCars_str__"2.0" -4667.6933 1772.594 -2.633 0.009 -8147.006 -1188.380 ** 145 GarageCars_str__"3.0" 5856.1126 9739.402 0.601 0.548 -13300.000 25000.000 146 GarageCond_NA_indicator__Missing 680.4035 1646.418 0.413 0.680 -2551.246 3912.053 147 GarageCond_NA_indicator__Not missing -685.2476 1646.464 -0.416 0.677 -3916.988 2546.493 148 GarageCond__Fa -133.7145 6514.937 -0.021 0.984 -12900.000 12700.000 149 GarageCond__Gd -3771.7805 9538.061 -0.395 0.693 -22500.000 14900.000 150 GarageCond__Po 339.2597 13600.000 0.025 0.980 -26400.000 27100.000 151 GarageCond__TA 3561.3912 5715.355 0.623 0.533 -7656.916 14800.000 152 GarageFinish_NA_indicator__Missing 680.4035 1646.418 0.413 0.680 -2551.246 3912.053 153 GarageFinish_NA_indicator__Not missing -685.2476 1646.464 -0.416 0.677 -3916.988 2546.493 154 GarageFinish__Fin 3738.3343 1692.317 2.209 0.027 416.593 7060.075 * 155 GarageFinish__RFn -458.4012 1471.812 -0.311 0.756 -3347.327 2430.524 156 GarageFinish__Unf -3284.7773 1792.833 -1.832 0.067 -6803.816 234.262 . 
157 GarageQual_NA_indicator__Missing 680.4035 1646.418 0.413 0.680 -2551.246 3912.053 158 GarageQual_NA_indicator__Not missing -685.2476 1646.464 -0.416 0.677 -3916.988 2546.493 159 GarageQual__Fa -5345.1643 5470.095 -0.977 0.329 -16100.000 5391.738 160 GarageQual__Gd 6651.4683 7537.120 0.882 0.378 -8142.668 21400.000 161 GarageQual__TA -1311.1481 4107.718 -0.319 0.750 -9373.928 6751.631 162 GarageType_NA_indicator__Missing 680.4035 1646.418 0.413 0.680 -2551.246 3912.053 163 GarageType_NA_indicator__Not missing -685.2476 1646.464 -0.416 0.677 -3916.988 2546.493 164 GarageType__Attchd -2061.8625 3429.870 -0.601 0.548 -8794.137 4670.412 165 GarageType__Basment 3912.7792 7254.331 0.539 0.590 -10300.000 18200.000 166 GarageType__BuiltIn -11090.0000 4816.018 -2.302 0.022 -20500.000 -1635.359 * 167 GarageType__CarPort 6667.5872 10100.000 0.660 0.509 -13200.000 26500.000 168 GarageType__Detchd 2565.0667 3586.470 0.715 0.475 -4474.587 9604.721 169 GarageYrBlt -78.1122 85.236 -0.916 0.360 -245.416 89.192 170 GarageYrBlt_NA_indicator__Missing 680.4035 1646.418 0.413 0.680 -2551.246 3912.053 171 GarageYrBlt_NA_indicator__Not missing -685.2476 1646.464 -0.416 0.677 -3916.988 2546.493 172 GrLivArea 44.9795 12.304 3.656 0.000 20.828 69.131 *** 173 HalfBath 1375.3034 4992.634 0.275 0.783 -8424.421 11200.000 174 HalfBath_str__"0.0" -1618.7921 1027.159 -1.576 0.115 -3634.937 397.353 175 HalfBath_str__"1.0" 1852.5925 5065.087 0.366 0.715 -8089.346 11800.000 176 HalfBath_str__"2.0" -238.6445 4922.919 -0.048 0.961 -9901.530 9424.241 177 HeatingQC__Ex -85.4541 2301.320 -0.037 0.970 -4602.569 4431.660 178 HeatingQC__Fa 2121.0319 4549.793 0.466 0.641 -6809.468 11100.000 179 HeatingQC__Gd -635.2931 2292.879 -0.277 0.782 -5135.840 3865.254 180 HeatingQC__TA -1405.1288 2145.689 -0.655 0.513 -5616.767 2806.509 181 Heating__GasA 4150.6873 5768.730 0.720 0.472 -7172.387 15500.000 182 Heating__GasW 6590.8025 7041.075 0.936 0.350 -7229.678 20400.000 183 Heating__Rare cases -10750.0000 8280.814 
-1.298 0.195 -27000.000 5507.552 184 HouseStyle__1.5Fin 12340.0000 11400.000 1.081 0.280 -10100.000 34700.000 185 HouseStyle__1.5Unf 23660.0000 32500.000 0.728 0.467 -40100.000 87400.000 186 HouseStyle__1Story 18070.0000 9429.892 1.916 0.056 -442.597 36600.000 . 187 HouseStyle__2.5Unf -56030.0000 20100.000 -2.783 0.006 -95500.000 -16500.000 ** 188 HouseStyle__2Story -3668.3682 9861.095 -0.372 0.710 -23000.000 15700.000 189 HouseStyle__Rare cases -61360.0000 21300.000 -2.879 0.004 -103000.000 -19500.000 ** 190 HouseStyle__SFoyer 40870.0000 14400.000 2.832 0.005 12500.000 69200.000 ** 191 HouseStyle__SLvl 26120.0000 15000.000 1.744 0.082 -3277.895 55500.000 . 192 KitchenAbvGr -22560.0000 18400.000 -1.223 0.222 -58700.000 13600.000 193 KitchenAbvGr_str__"1.0" -10480.0000 10300.000 -1.021 0.308 -30600.000 9663.654 194 KitchenAbvGr_str__"2.0" 10470.0000 10300.000 1.021 0.308 -9668.246 30600.000 195 KitchenQual__Ex 13530.0000 4286.411 3.156 0.002 5113.490 21900.000 ** 196 KitchenQual__Fa 167.9946 5120.826 0.033 0.974 -9883.351 10200.000 197 KitchenQual__Gd -5490.9265 2529.325 -2.171 0.030 -10500.000 -526.275 * 198 KitchenQual__TA -8208.9276 2518.350 -3.260 0.001 -13200.000 -3265.819 ** 199 LandContour__Bnk -4735.4850 4365.735 -1.085 0.278 -13300.000 3833.740 200 LandContour__HLS 4036.8129 4449.691 0.907 0.365 -4697.203 12800.000 201 LandContour__Low -4146.6704 5589.500 -0.742 0.458 -15100.000 6824.606 202 LandContour__Lvl 4840.4984 3095.355 1.564 0.118 -1235.178 10900.000 203 LandSlope__Gtl 3531.4813 5587.739 0.632 0.528 -7436.337 14500.000 204 LandSlope__Mod 9833.7261 5331.768 1.844 0.065 -631.664 20300.000 . 205 LandSlope__Sev -13370.0000 9398.259 -1.423 0.155 -31800.000 5077.195 206 LotArea 0.2382 0.157 1.521 0.129 -0.069 0.545 207 LotConfig__Corner -782.7630 2341.935 -0.334 0.738 -5379.599 3814.073 208 LotConfig__CulDSac 8492.8091 3100.784 2.739 0.006 2406.476 14600.000 ** 209 LotConfig__FR2 -7914.6539 4067.475 -1.946 0.052 -15900.000 69.135 . 
210 LotConfig__Inside 199.7637 1991.184 0.100 0.920 -3708.605 4108.132 211 LotFrontage -61.3684 59.958 -1.024 0.306 -179.057 56.320 212 LotFrontage_NA_indicator__Missing 361.5045 1322.591 0.273 0.785 -2234.526 2957.535 213 LotFrontage_NA_indicator__Not missing -366.3486 1322.594 -0.277 0.782 -2962.385 2229.687 214 LotShape__IR1 -285.9946 3165.893 -0.090 0.928 -6500.126 5928.137 215 LotShape__IR2 5812.4145 4657.052 1.248 0.212 -3328.618 15000.000 216 LotShape__IR3 -6226.6686 7700.723 -0.809 0.419 -21300.000 8888.593 217 LotShape__Reg 695.4045 3328.262 0.209 0.835 -5837.431 7228.240 218 LowQualFinSF 43.4357 33.537 1.295 0.196 -22.393 109.264 219 LowQualFinSF_str__"0.0" 9482.7561 7380.387 1.285 0.199 -5003.738 24000.000 220 LowQualFinSF_str__Rare cases -9487.6002 7380.361 -1.286 0.199 -24000.000 4998.842 221 MSSubClass 493.8193 976.779 0.506 0.613 -1423.438 2411.076 222 MSSubClass_str__"120.0" -35780.0000 33900.000 -1.055 0.292 -102000.000 30800.000 223 MSSubClass_str__"160.0" -45970.0000 71200.000 -0.645 0.519 -186000.000 93900.000 224 MSSubClass_str__"180.0" -84060.0000 91900.000 -0.915 0.360 -264000.000 96200.000 225 MSSubClass_str__"190.0" -64190.0000 105000.000 -0.614 0.539 -269000.000 141000.000 226 MSSubClass_str__"20.0" 47480.0000 68700.000 0.691 0.490 -87400.000 182000.000 227 MSSubClass_str__"30.0" 33510.0000 60500.000 0.554 0.580 -85200.000 152000.000 228 MSSubClass_str__"45.0" 24510.0000 56700.000 0.432 0.666 -86700.000 136000.000 229 MSSubClass_str__"50.0" 23510.0000 40700.000 0.577 0.564 -56400.000 103000.000 230 MSSubClass_str__"60.0" 25700.0000 30300.000 0.847 0.397 -33800.000 85200.000 231 MSSubClass_str__"70.0" 26020.0000 22600.000 1.150 0.251 -18400.000 70400.000 232 MSSubClass_str__"75.0" 79400.0000 26500.000 3.001 0.003 27500.000 131000.000 ** 233 MSSubClass_str__"80.0" 3265.1889 17700.000 0.184 0.854 -31500.000 38000.000 234 MSSubClass_str__"85.0" -24220.0000 15800.000 -1.538 0.125 -55100.000 6700.558 235 MSSubClass_str__"90.0" -9159.0609 
6250.601 -1.465 0.143 -21400.000 3109.847 236 MSZoning__C (all) -8905.7206 13200.000 -0.673 0.501 -34900.000 17100.000 237 MSZoning__FV 15280.0000 8223.752 1.858 0.064 -861.201 31400.000 . 238 MSZoning__RH -5202.4775 8712.739 -0.597 0.551 -22300.000 11900.000 239 MSZoning__RL -320.9281 4630.852 -0.069 0.945 -9410.534 8768.678 240 MSZoning__RM -856.3985 5250.649 -0.163 0.870 -11200.000 9449.768 241 MasVnrArea 21.2898 7.569 2.813 0.005 6.433 36.147 ** 242 MasVnrArea_NA_indicator__Not missing -285.3145 2709.717 -0.105 0.916 -5604.047 5033.418 243 MasVnrArea_NA_indicator__Rare cases 280.4703 2709.737 0.104 0.918 -5038.300 5599.241 244 MasVnrType_NA_indicator__Not missing -285.3145 2709.717 -0.105 0.916 -5604.047 5033.418 245 MasVnrType_NA_indicator__Rare cases 280.4703 2709.737 0.104 0.918 -5038.300 5599.241 246 MasVnrType__BrkCmn -6615.8020 6992.351 -0.946 0.344 -20300.000 7109.041 247 MasVnrType__BrkFace -931.6397 2857.815 -0.326 0.745 -6541.063 4677.784 248 MasVnrType__None 5201.6034 3011.026 1.728 0.084 -708.549 11100.000 . 
249 MasVnrType__Stone 2340.9941 3564.531 0.657 0.512 -4655.597 9337.585 250 MiscFeature_NA_indicator__Missing 1180.1962 17100.000 0.069 0.945 -32400.000 34800.000 251 MiscFeature_NA_indicator__Not missing -1185.0404 17100.000 -0.069 0.945 -34800.000 32400.000 252 MiscFeature__Othr 280400.0000 35400.000 7.922 0.000 211000.000 350000.000 *** 253 MiscFeature__Rare cases -557700.0000 70500.000 -7.907 0.000 -696000.000 -419000.000 *** 254 MiscFeature__Shed 277300.0000 35200.000 7.872 0.000 208000.000 346000.000 *** 255 MiscVal 12.1074 10.023 1.208 0.227 -7.566 31.781 256 MiscVal_str__"0.0" 3802.7136 23300.000 0.163 0.871 -42000.000 49600.000 257 MiscVal_str__"400.0" -7832.2406 13600.000 -0.574 0.566 -34600.000 19000.000 258 MiscVal_str__Rare cases 4024.6830 13100.000 0.306 0.759 -21800.000 29800.000 259 MoSold -162.5809 352.020 -0.462 0.644 -853.538 528.376 260 MoSold_str__"1.0" 825.3061 3570.885 0.231 0.817 -6183.758 7834.370 261 MoSold_str__"10.0" -6136.2362 3310.473 -1.854 0.064 -12600.000 361.682 . 
262 MoSold_str__"11.0" 2494.8557 3524.744 0.708 0.479 -4423.641 9413.352 263 MoSold_str__"12.0" 721.1909 3496.536 0.206 0.837 -6141.939 7584.321 264 MoSold_str__"2.0" -7494.5687 3800.907 -1.972 0.049 -15000.000 -34.009 * 265 MoSold_str__"3.0" 2719.6670 3037.271 0.895 0.371 -3242.000 8681.334 266 MoSold_str__"4.0" 3089.2057 2847.330 1.085 0.278 -2499.638 8678.049 267 MoSold_str__"5.0" 828.0756 2398.801 0.345 0.730 -3880.379 5536.530 268 MoSold_str__"6.0" 1984.3847 2203.429 0.901 0.368 -2340.587 6309.356 269 MoSold_str__"7.0" 4880.2269 2324.465 2.100 0.036 317.681 9442.773 * 270 MoSold_str__"8.0" -3794.3772 2965.946 -1.279 0.201 -9616.044 2027.289 271 MoSold_str__"9.0" -122.5747 4143.337 -0.030 0.976 -8255.269 8010.120 272 Neighborhood__Blmngtn -5914.0752 9519.548 -0.621 0.535 -24600.000 12800.000 273 Neighborhood__BrDale -2432.6525 12000.000 -0.203 0.839 -26000.000 21100.000 274 Neighborhood__BrkSide -706.0561 6392.617 -0.110 0.912 -13300.000 11800.000 275 Neighborhood__ClearCr 6665.2772 7588.666 0.878 0.380 -8230.034 21600.000 276 Neighborhood__CollgCr 2708.1320 4007.812 0.676 0.499 -5158.548 10600.000 277 Neighborhood__Crawfor 18230.0000 6061.059 3.008 0.003 6334.418 30100.000 ** 278 Neighborhood__Edwards -22530.0000 4064.705 -5.543 0.000 -30500.000 -14600.000 *** 279 Neighborhood__Gilbert -821.7721 5166.817 -0.159 0.874 -11000.000 9319.845 280 Neighborhood__IDOTRR -19280.0000 9037.441 -2.134 0.033 -37000.000 -1545.446 * 281 Neighborhood__MeadowV -9512.1907 12300.000 -0.774 0.439 -33600.000 14600.000 282 Neighborhood__Mitchel -8044.7776 5181.962 -1.552 0.121 -18200.000 2126.567 283 Neighborhood__NAmes -9016.9380 3602.958 -2.503 0.013 -16100.000 -1944.919 * 284 Neighborhood__NWAmes -7408.2526 4826.646 -1.535 0.125 -16900.000 2065.666 285 Neighborhood__NoRidge 37050.0000 6546.908 5.659 0.000 24200.000 49900.000 *** 286 Neighborhood__NridgHt 26830.0000 5617.810 4.775 0.000 15800.000 37900.000 *** 287 Neighborhood__OldTown -16750.0000 6553.809 -2.555 0.011 -29600.000 
-3881.914 * 288 Neighborhood__Rare cases 890.5270 12000.000 0.074 0.941 -22700.000 24400.000 289 Neighborhood__SWISU -23890.0000 7993.583 -2.988 0.003 -39600.000 -8196.887 ** 290 Neighborhood__Sawyer -4871.8407 4570.224 -1.066 0.287 -13800.000 4098.762 291 Neighborhood__SawyerW 3795.9915 4834.402 0.785 0.433 -5693.149 13300.000 292 Neighborhood__Somerst 1489.9423 7870.593 0.189 0.850 -14000.000 16900.000 293 Neighborhood__StoneBr 29320.0000 8768.744 3.344 0.001 12100.000 46500.000 ** 294 Neighborhood__Timber -6188.9940 6859.789 -0.902 0.367 -19700.000 7275.651 295 Neighborhood__Veenker 10380.0000 9493.151 1.094 0.274 -8250.729 29000.000 296 OpenPorchSF 1.6354 16.398 0.100 0.921 -30.551 33.821 297 OverallCond 1497.5135 5795.275 0.258 0.796 -9877.665 12900.000 298 OverallCond_str__"3.0" -10340.0000 18000.000 -0.574 0.566 -45700.000 25000.000 299 OverallCond_str__"4.0" -10590.0000 12500.000 -0.848 0.397 -35100.000 13900.000 300 OverallCond_str__"5.0" -1962.7515 6797.464 -0.289 0.773 -15300.000 11400.000 301 OverallCond_str__"6.0" 2373.2526 2950.940 0.804 0.421 -3418.961 8165.467 302 OverallCond_str__"7.0" 4477.0479 6465.453 0.692 0.489 -8213.581 17200.000 303 OverallCond_str__"8.0" 2590.3440 12500.000 0.208 0.835 -21900.000 27000.000 304 OverallCond_str__"9.0" 13440.0000 18700.000 0.718 0.473 -23300.000 50200.000 305 OverallQual 5601.9469 6462.640 0.867 0.386 -7083.159 18300.000 306 OverallQual_str__"10.0" 7465.6789 24700.000 0.302 0.763 -41100.000 56000.000 307 OverallQual_str__"3.0" 13810.0000 24100.000 0.573 0.567 -33500.000 61100.000 308 OverallQual_str__"4.0" -5916.2937 16400.000 -0.360 0.719 -38100.000 26300.000 309 OverallQual_str__"5.0" -10000.0000 10800.000 -0.930 0.352 -31100.000 11100.000 310 OverallQual_str__"6.0" -11690.0000 5114.898 -2.286 0.022 -21700.000 -1654.867 * 311 OverallQual_str__"7.0" -8737.1629 4665.545 -1.873 0.061 -17900.000 420.539 . 
312 OverallQual_str__"8.0" 1787.5831 10100.000 0.177 0.859 -18000.000 21600.000 313 OverallQual_str__"9.0" 13280.0000 17500.000 0.760 0.447 -21000.000 47600.000 314 PavedDrive__N 1837.0093 3886.008 0.473 0.637 -5790.590 9464.608 315 PavedDrive__P -1549.7854 4803.762 -0.323 0.747 -11000.000 7879.215 316 PavedDrive__Y -292.0680 3120.131 -0.094 0.925 -6416.376 5832.240 317 PoolArea 1341.8090 175.691 7.637 0.000 996.956 1686.662 *** 318 PoolQC__Fa 786.7950 1418.131 0.555 0.579 -1996.765 3570.355 319 PoolQC__Gd -791.6392 1418.117 -0.558 0.577 -3575.170 1991.892 320 RoofMatl__CompShg 224.5666 7815.854 0.029 0.977 -15100.000 15600.000 321 RoofMatl__Rare cases 25620.0000 8106.762 3.161 0.002 9710.871 41500.000 ** 322 RoofMatl__Tar&Grv -25850.0000 11800.000 -2.188 0.029 -49000.000 -2661.402 * 323 RoofStyle__Flat -1899.6956 14800.000 -0.129 0.898 -30900.000 27100.000 324 RoofStyle__Gable 2573.2021 5599.293 0.460 0.646 -8417.296 13600.000 325 RoofStyle__Gambrel -9681.6685 11200.000 -0.868 0.386 -31600.000 12200.000 326 RoofStyle__Hip 2109.4958 5875.727 0.359 0.720 -9423.597 13600.000 327 RoofStyle__Rare cases 6893.8222 12500.000 0.553 0.580 -17600.000 31300.000 328 SaleCondition__Abnorml 2194.4422 5457.420 0.402 0.688 -8517.581 12900.000 329 SaleCondition__Alloca -11350.0000 11200.000 -1.012 0.312 -33400.000 10700.000 330 SaleCondition__Family -1574.9543 7369.950 -0.214 0.831 -16000.000 12900.000 331 SaleCondition__Normal 8828.3764 4684.507 1.885 0.060 -366.546 18000.000 . 332 SaleCondition__Partial 1897.6217 14100.000 0.134 0.893 -25800.000 29600.000 333 SaleType__COD -9845.4683 6847.459 -1.438 0.151 -23300.000 3594.976 334 SaleType__New 11530.0000 13300.000 0.868 0.386 -14500.000 37600.000 335 SaleType__Rare cases 5998.3642 6785.273 0.884 0.377 -7320.018 19300.000 336 SaleType__WD -7691.3930 5267.584 -1.460 0.145 -18000.000 2648.013 337 ScreenPorch 63.8770 15.742 4.058 0.000 32.978 94.776 *** 338 TotRmsAbvGrd 9411.1697 5112.907 1.841 0.066 -624.632 19400.000 . 
339 TotRmsAbvGrd_str__"10.0" -5932.6693 13200.000 -0.450 0.653 -31800.000 19900.000 340 TotRmsAbvGrd_str__"11.0" -45650.0000 18600.000 -2.448 0.015 -82200.000 -9048.481 * 341 TotRmsAbvGrd_str__"12.0" -22610.0000 23800.000 -0.950 0.342 -69300.000 24100.000 342 TotRmsAbvGrd_str__"3.0" 28510.0000 23600.000 1.210 0.227 -17800.000 74800.000 343 TotRmsAbvGrd_str__"4.0" 23510.0000 17500.000 1.346 0.179 -10800.000 57800.000 344 TotRmsAbvGrd_str__"5.0" 15310.0000 12500.000 1.225 0.221 -9227.869 39900.000 345 TotRmsAbvGrd_str__"6.0" 11730.0000 7721.951 1.519 0.129 -3428.812 26900.000 346 TotRmsAbvGrd_str__"7.0" 5178.9634 3701.284 1.399 0.162 -2086.053 12400.000 347 TotRmsAbvGrd_str__"8.0" -2351.4821 4316.603 -0.545 0.586 -10800.000 6121.304 348 TotRmsAbvGrd_str__"9.0" -7706.4971 8632.447 -0.893 0.372 -24700.000 9237.586 349 TotalBsmtSF 7.6727 5.564 1.379 0.168 -3.248 18.594 350 WoodDeckSF 22.3136 8.225 2.713 0.007 6.170 38.457 ** 351 YearBuilt 36.1893 121.956 0.297 0.767 -203.191 275.569 352 YearRemodAdd 154.0372 78.419 1.964 0.050 0.114 307.960 . 353 YrSold -299.4167 154.369 -1.940 0.053 -602.419 3.585 . 354 YrSold_str__"2006.0" -1461.1936 1805.449 -0.809 0.419 -5004.994 2082.607 355 YrSold_str__"2007.0" -1503.7430 1759.694 -0.855 0.393 -4957.736 1950.250 356 YrSold_str__"2008.0" 637.4187 1817.045 0.351 0.726 -2929.143 4203.980 357 YrSold_str__"2009.0" -356.1383 1751.333 -0.203 0.839 -3793.718 3081.442 358 YrSold_str__"2010.0" 2678.8120 2349.664 1.140 0.255 -1933.195 7290.819 ------------------------- --- Model statistic --- R-squared : 0.918 Adj. R-squared : 0.892 F-statistic : 35 Prob (F-statistic): 0.0 No. 
Observations : 1095 AIC : 25561 Df Residuals : 827 BIC : 26900 RMSE (test) : 62323 ------------------------- Maximum correlation between Reseduals and any data columns is 1.999455839599218e-12, with columns <LotArea> Mean of train reseduals: -2.5779030128428924e-08 ------------------------------------- Random Forest ------------------------------------- ------------------------- RF model peramters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 200, 'n_jobs': None, 'oob_score': True, 'random_state': None, 'verbose': 0, 'warm_start': False} -------------------------
-------------------------
--- Model statistic ---
R^2 (test) : 0.8853590416339046
R^2 (train): 0.9797012301701824
RMSE (test): 28613
oob score : 0.8474266006004882
-------------------------
Maximum correlation between Reseduals and any data columns is 0.4524305756076382, with columns <Exterior2nd__ImStucc>